package cn.tonyandmoney.lib.webmagic.pages;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.HtmlNode;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} for pages under {@code zfs.mee.gov.cn}.
 *
 * <p>A page whose URL fully matches the configured page regex is treated as an index (list)
 * page: article links and pagination links are queued for crawling. Every other page is
 * treated as an article page: title, dates, summary and content are extracted into result
 * fields.
 *
 * <p>Both {@link #setPageRegex(String)} and {@link #setArticleRegex(String)} must be called
 * before the processor is used; {@link #process(Page)} does not guard against unset regexes.
 *
 * <p>Created by niantuo on 2019/2/14.
 */
public class MeePageProcessor implements PageProcessor {

    private static final Logger LOGGER = LoggerFactory.getLogger(MeePageProcessor.class);

    /**
     * Crawl settings, built once so every {@link #getSite()} call returns the same instance.
     * The domain is the bare host name (no scheme).
     */
    private final Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(100)
            .setDomain("zfs.mee.gov.cn");

    /** Regex used to pick article links out of an index page. */
    private String articleRegex;

    /** Regex that identifies index pages and filters pagination links. */
    private String indexUrl;

    /**
     * Sets the regex that identifies index (list) pages. The same regex is used to filter the
     * pagination links found on an index page.
     *
     * @param regex regex that an index page URL must fully match
     * @return this processor, for chaining
     */
    public MeePageProcessor setPageRegex(String regex) {
        this.indexUrl = regex;
        return this;
    }

    /**
     * Sets the regex used to select article links from an index page.
     *
     * @param regex regex applied to the links found in the list container
     * @return this processor, for chaining
     */
    public MeePageProcessor setArticleRegex(String regex) {
        this.articleRegex = regex;
        return this;
    }

    /**
     * Dispatches the page to index or article handling depending on whether its URL fully
     * matches the page regex.
     *
     * <p>Example of a list entry on an index page:
     * <pre>{@code
     * <div class="bgleft"><span>2019-01-11</span><a target="_blank" rel="noreferrer"
     *   href="http://zfs.mee.gov.cn/fl/201901/t20190111_689250.shtml"
     *   title="中华人民共和国环境噪声污染防治法">中华人民共和国环境噪声污染防治法</a></div>
     * }</pre>
     *
     * @param page the page currently being processed
     */
    @Override
    public void process(Page page) {
        LOGGER.info("page=>{}", page.getUrl());
        String url = page.getUrl().toString();
        if (url.matches(indexUrl)) {
            processIndexPage(page);
        } else {
            processArticlePage(page, url);
        }
    }

    /** Queues the article links and the pagination links found on an index page. */
    private void processIndexPage(Page page) {
        Html html = page.getHtml();

        List<String> articleLinks = html.xpath("//div[@class='main_rt_list']")
                .links()
                .regex(articleRegex, 0)
                .all();
        page.addTargetRequests(articleLinks);

        List<String> pageLinks = html.xpath("//div[@class='page']")
                .links()
                .regex(indexUrl, 0)
                .all();
        page.addTargetRequests(pageLinks);
    }

    /**
     * Extracts the article fields and stores them on the page. Nothing is stored when no
     * content container is found.
     *
     * <p>Expected header markup:
     * <pre>{@code
     * <div class="wzxq_biaoti2">
     *   <p>title text</p>
     *   <span class="wzxq_fbt2"> <p>2019-01-11 </p> </span>
     *   <hr>
     * </div>
     * }</pre>
     */
    private void processArticlePage(Page page, String url) {
        Html html = page.getHtml();

        String title = extractTitle(html);
        // The date paragraph may carry surrounding whitespace (see sample markup), so trim it.
        String createTime = StringUtils.trim(
                html.css("span.wzxq_fbt2").xpath("//p[1]").regex("<p>(.*)</p>").toString());

        // Prefer the Custom_UnionStyle container, fall back to TRS_Editor.
        Selectable contentDom = html.css("div.Custom_UnionStyle");
        if (contentDom == null || !contentDom.match()) {
            contentDom = html.css("div.TRS_Editor");
        }
        if (contentDom == null || !contentDom.match()) {
            LOGGER.error("contentDom is null ! url={}", url);
            return;
        }

        // Run the structural queries before the replacements below: replace() yields a new
        // Selectable built from the replaced text, so extract from the original node first.
        String summary = contentDom.xpath("/p").xpath("/strong")
                .regex("<strong>(.*)</strong>").toString();
        // Effective-date sentence, e.g. "本法自2003年9月1日起施行" ("this law takes effect on ...").
        // Read from the container actually selected so the fallback container is covered too.
        String publishTime = contentDom.regex("本法自(.*)起施行").toString();

        // replace() returns a new Selectable; chain the calls so both substitutions are kept.
        Selectable content = contentDom
                .replace("TRS_Editor", "law-editor")
                .replace("Custom_UnionStyle", "law-content");

        page.putField("content", content.toString());
        page.putField("publishTime", publishTime);
        page.putField("title", title);
        page.putField("createTime", createTime);
        page.putField("summary", summary);
    }

    /**
     * Reads the title from the first paragraph of the header block. If the capturing regex
     * yields nothing, falls back to stripping the {@code <p>} tags from the raw node text.
     *
     * @return the trimmed title, or {@code null} if none could be extracted
     */
    private String extractTitle(Html html) {
        Selectable titleNode = html.css("div.wzxq_biaoti2").xpath("/div/p[1]");

        String title = titleNode.regex("<p>(.*)</p>(.*)").toString();
        if (StringUtils.isBlank(title)) {
            String rawTitle = titleNode.get();
            if (StringUtils.isNotBlank(rawTitle)) {
                title = rawTitle.replaceAll("<(/)*p>", "");
            }
        }
        return StringUtils.trim(title);
    }

    /**
     * Returns the crawl settings for this processor.
     *
     * @return the shared {@link Site} instance (3 retries, 100 ms sleep time)
     */
    @Override
    public Site getSite() {
        return site;
    }
}
